import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from matplotlib import pyplot
from scipy import stats
from pprint import pprint
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer
from missingpy import MissForest
from timeit import default_timer as timer
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", RuntimeWarning)
# Load the raw 2016 benchmarking data plus the preprocessed feature/target tables.
# NOTE(review): machine-specific absolute paths — consider making these configurable.
df = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/2016_Building_Energy_Benchmarking.csv"
)

# The saved CSVs carry their old index as an "Unnamed: 0" column; drop it after reading.
df_emissions = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/df_emissions.csv"
)
df_emissions = df_emissions.drop(columns="Unnamed: 0")

y_emissions = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/y_emissions.csv"
)
y_emissions = y_emissions.drop(columns="Unnamed: 0")

# Styled summary table: highlight mean / std / median with a diverging colormap.
styled_summary = df_emissions.describe().T.style
for stat_col in ("mean", "std", "50%"):
    styled_summary = styled_summary.background_gradient(subset=[stat_col], cmap="coolwarm")
styled_summary
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| BuildingAge | 3356.000000 | 0.408725 | 0.285298 | 0.008621 | 0.163793 | 0.353448 | 0.586207 | 1.000000 |
| PropertyGFATotal | 3356.000000 | 0.754631 | 0.059325 | 0.644223 | 0.708094 | 0.738504 | 0.788411 | 1.000000 |
| PropertyGFABuilding(s) | 3356.000000 | 0.756200 | 0.057284 | 0.569975 | 0.711190 | 0.742081 | 0.788584 | 1.000000 |
| LargestPropertyUseTypeGFA | 3356.000000 | 0.751066 | 0.058920 | 0.602759 | 0.706632 | 0.739002 | 0.783606 | 1.000000 |
| SteamUse | 3356.000000 | 0.038439 | 0.192281 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| NaturalGasUse | 3356.000000 | 0.628129 | 0.483376 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| ElectricityUse | 3356.000000 | 0.628129 | 0.483376 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| ENERGYSTARScore | 3356.000000 | 0.667437 | 0.241667 | 0.010000 | 0.540150 | 0.700000 | 0.860000 | 1.000000 |
| T_Distribution Center | 3356.000000 | 0.015793 | 0.124691 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_High-Rise Multifamily | 3356.000000 | 0.031287 | 0.174119 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Hospital | 3356.000000 | 0.002980 | 0.054514 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Hotel | 3356.000000 | 0.022944 | 0.149747 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_K-12 School | 3356.000000 | 0.040524 | 0.197215 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Laboratory | 3356.000000 | 0.002980 | 0.054514 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Large Office | 3356.000000 | 0.051549 | 0.221149 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Low-Rise Multifamily | 3356.000000 | 0.291418 | 0.454483 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| T_Medical Office | 3356.000000 | 0.011621 | 0.107188 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Mid-Rise Multifamily | 3356.000000 | 0.168057 | 0.373973 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Mixed Use Property | 3356.000000 | 0.039333 | 0.194414 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Office | 3356.000000 | 0.000894 | 0.029890 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Other | 3356.000000 | 0.075387 | 0.264055 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Refrigerated Warehouse | 3356.000000 | 0.003576 | 0.059699 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Residence Hall | 3356.000000 | 0.006853 | 0.082513 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Restaurant | 3356.000000 | 0.003576 | 0.059699 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Retail Store | 3356.000000 | 0.027116 | 0.162445 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Self-Storage Facility | 3356.000000 | 0.008343 | 0.090973 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Senior Care Community | 3356.000000 | 0.013409 | 0.115035 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Small- and Mid-Sized Office | 3356.000000 | 0.087008 | 0.281889 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Supermarket / Grocery Store | 3356.000000 | 0.011621 | 0.107188 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_University | 3356.000000 | 0.006853 | 0.082513 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Warehouse | 3356.000000 | 0.055721 | 0.229417 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Worship Facility | 3356.000000 | 0.021156 | 0.143926 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Campus | 3356.000000 | 0.006555 | 0.080712 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Multifamily HR (10+) | 3356.000000 | 0.032777 | 0.178079 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Multifamily LR (1-4) | 3356.000000 | 0.300358 | 0.458482 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| T_Multifamily MR (5-9) | 3356.000000 | 0.172825 | 0.378152 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_NonResidential | 3356.000000 | 0.433552 | 0.495639 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| T_Nonresidential COS | 3356.000000 | 0.025328 | 0.157142 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Nonresidential WA | 3356.000000 | 0.000298 | 0.017262 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_SPS-District K-12 | 3356.000000 | 0.028308 | 0.165875 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
from sklearn import model_selection

# Shuffle and hold out 30% of the observations for testing.
split = model_selection.train_test_split(
    df_emissions,
    np.ravel(y_emissions),
    test_size=0.3,
    shuffle=True,
)
X_train_emissions, X_test_emissions, y_train_emissions, y_test_emissions = split
from autofeat import AutoFeatModel, AutoFeatRegressor, FeatureSelector
AutoFeat est une librairie qui exécute des opérations prédéfinies sur les colonnes afin d'affiner la capacité prédictive d'un modèle de régression. L'algorithme calcule de nombreuses transformations et combinaisons de features, puis choisit les plus pertinentes en suivant des critères de régularisation et en fonction du r^2 d'une régression linéaire simple. Nous nous limitons finalement à 2 étapes ici, afin d'éviter tout overfit.
# Compare AutoFeat feature engineering at depths 0..3: fit on the training
# set, report the held-out R^2, and plot predicted vs. actual emissions.
for steps in range(4):
    np.random.seed(55)  # fixed seed so the runs are comparable
    print("### AutoFeat with %i feateng_steps" % steps)
    afreg = AutoFeatRegressor(
        verbose=1,
        feateng_steps=steps,
        n_jobs=-1,
        max_gb=4,
        transformations=("exp", "abs", "sqrt", "^2", "^3"),
    )
    y_train_flat = np.ravel(y_train_emissions)
    df_train_emissions = afreg.fit_transform(X_train_emissions, y_train_flat)
    df_test_emissions = afreg.transform(X_test_emissions)
    # score() re-applies the learned transformations internally.
    r2 = afreg.score(X_test_emissions, np.ravel(y_test_emissions))
    print("## Final R^2: %.4f" % r2)
    plt.figure()
    plt.scatter(afreg.predict(X_test_emissions), np.ravel(y_test_emissions), s=2)
    plt.title("%i FE steps (R^2: %.4f; %i new features)"
              % (steps, r2, len(afreg.new_feat_cols_)))
### AutoFeat with 0 feateng_steps [AutoFeat] The 0 step feature engineering process could generate up to 40 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.00 gb of space. [feateng] Warning: no features generated for max_steps < 1. [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 4.0s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 4.1s remaining: 6.1s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 4.1s remaining: 2.7s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.2s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.2s finished [featsel] 28 features after 5 feature selection runs [featsel] 23 features after correlation filtering [featsel] 20 features after noise filtering [AutoFeat] Final dataframe with 40 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: -6.827934530891982 13.029636 * PropertyGFATotal 2.248871 * T_Hospital 1.613309 * T_Laboratory 1.538795 * ElectricityUse 1.366093 * T_Supermarket / Grocery Store 1.291425 * T_Restaurant -1.080356 * T_Self-Storage Facility 1.071840 * SteamUse -0.933848 * ENERGYSTARScore 0.687284 * T_Senior Care Community -0.680353 * T_Distribution Center 0.673995 * T_Campus -0.588811 * T_Warehouse 0.433497 * T_Hotel 0.427806 * T_Nonresidential COS 0.354205 * T_Other 0.341727 * BuildingAge 0.251722 * T_K-12 School -0.182780 * T_Low-Rise Multifamily -0.164351 * T_Multifamily MR (5-9) [AutoFeat] Final score: 0.7863 ## Final R^2: 0.8040 ### AutoFeat with 1 feateng_steps [AutoFeat] The 1 step feature engineering process could generate up to 200 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.00 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. 
[feateng] Generated altogether 20 new features in 1 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 2 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.1s [Parallel(n_jobs=-1)]: Batch computation too fast (0.1440s.) Setting batch_size=2. [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.2s remaining: 0.2s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.2s remaining: 0.1s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.2s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.2s finished [featsel] 29 features after 5 feature selection runs [featsel] 25 features after correlation filtering [featsel] 21 features after noise filtering [AutoFeat] Final dataframe with 40 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: -6.702173157432794 12.886090 * PropertyGFATotal 2.242766 * T_Hospital 1.586548 * T_Laboratory 1.545421 * NaturalGasUse 1.338709 * T_Supermarket / Grocery Store 1.258314 * T_Restaurant -1.107176 * T_Self-Storage Facility 1.072421 * SteamUse -0.929834 * ENERGYSTARScore -0.707516 * T_Distribution Center 0.675562 * T_Campus 0.667535 * T_Senior Care Community -0.614781 * T_Warehouse 0.419540 * T_Nonresidential COS 0.412831 * T_Hotel 0.342522 * BuildingAge 0.330572 * T_Other -0.256819 * T_Worship Facility 0.225557 * T_K-12 School -0.210484 * T_Low-Rise Multifamily -0.187433 * T_Multifamily MR (5-9) [AutoFeat] Final score: 0.7869 ## Final R^2: 0.8037 ### AutoFeat with 2 feateng_steps [AutoFeat] The 2 step feature engineering process could generate up to 20100 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.19 gb of space. 
[feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 1288 feature combinations from 1770 original feature tuples - done. [feateng] Generated altogether 1785 new features in 2 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 411 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.6s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.7s remaining: 1.0s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.7s remaining: 0.5s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.5s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.5s finished [featsel] 47 features after 5 feature selection runs [featsel] 40 features after correlation filtering [featsel] 38 features after noise filtering [AutoFeat] Computing 20 new features. [AutoFeat] 20/ 20 new features ...done. [AutoFeat] Final dataframe with 60 feature columns (20 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: -6.152838183233959 11.304254 * PropertyGFATotal 1.928986 * BuildingAge*T_SeniorCareCommunity 1.730657 * SteamUse -1.576493 * T_LowRiseMultifamily*T_NonResidential 1.519428 * BuildingAge**2*T_Restaurant 1.486629 * T_Laboratory 1.453133 * ENERGYSTARScore**3*T_Other 1.389862 * NaturalGasUse 1.355411 * ENERGYSTARScore*T_Hospital 1.197877 * T_Supermarket / Grocery Store -1.192600 * ElectricityUse*SteamUse -1.158375 * T_Self-Storage Facility 1.150525 * T_Hospital -1.125014 * BuildingAge**2*T_DistributionCenter -1.080764 * ENERGYSTARScore*T_Warehouse -0.981637 * BuildingAge**3*T_Other -0.830512 * ENERGYSTARScore*T_DistributionCenter -0.821253 * NaturalGasUse*T_LargeOffice -0.763016 * BuildingAge**3*T_NonResidential -0.718473 * BuildingAge**3*T_Warehouse 0.708819 * LargestPropertyUseTypeGFA**3*exp(BuildingAge) 0.623028 * T_Campus -0.606919 * ENERGYSTARScore -0.555793 * ENERGYSTARScore**3*T_DistributionCenter 0.545723 * BuildingAge**2*T_K12School 0.486679 * T_Restaurant 0.460761 * T_Nonresidential COS 0.443673 * T_Hotel 0.438108 * sqrt(BuildingAge)*ElectricityUse 0.432853 * T_Large Office -0.421948 * ENERGYSTARScore**2*T_NonResidential 0.419896 * T_NonResidential -0.289447 * T_Worship Facility -0.221218 * NaturalGasUse*T_SmallandMidSizedOffice 0.201767 * NaturalGasUse*T_MixedUseProperty -0.176044 * T_Low-Rise Multifamily -0.114453 * T_Mid-Rise Multifamily 0.093876 * T_Other [AutoFeat] Final score: 0.8205 [AutoFeat] Computing 20 new features. [AutoFeat] 20/ 20 new features ...done. [AutoFeat] Computing 20 new features. [AutoFeat] 20/ 20 new features ...done. ## Final R^2: 0.8380 [AutoFeat] Computing 20 new features. [AutoFeat] 20/ 20 new features ...done. ### AutoFeat with 3 feateng_steps [AutoFeat] The 3 step feature engineering process could generate up to 338500 features. [AutoFeat] With 2349 data points this new feature matrix would use about 3.18 gb of space. 
[feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 6595 feature combinations from 1770 original feature tuples - done. [feateng] Step 3: transformation of new features [feateng] Generated 21319 transformed features from 6595 original features - done. [feateng] Generated altogether 29460 new features in 3 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 8048 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 25.5s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 30.5s remaining: 45.8s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 33.5s remaining: 22.3s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 35.8s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 35.8s finished [featsel] 62 features after 5 feature selection runs [featsel] 33 features after correlation filtering [featsel] 29 features after noise filtering [AutoFeat] Computing 28 new features. [AutoFeat] 28/ 28 new features ...done. [AutoFeat] Final dataframe with 68 feature columns (28 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: -3.6959550612248067 -3.364231 * (LargestPropertyUseTypeGFA**2 - T_LowRiseMultifamily)**3 2.459194 * (PropertyGFABuildings - T_SelfStorageFacility)**2 -2.135852 * NaturalGasUse**2*PropertyGFATotal**6 -2.114525 * (-LargestPropertyUseTypeGFA + T_DistributionCenter)**3 1.747490 * (-sqrt(ENERGYSTARScore) + PropertyGFATotal**3)**3 1.702286 * Abs(LargestPropertyUseTypeGFA**2 - T_Warehouse) 1.618613 * PropertyGFATotal 1.549116 * (PropertyGFATotal**2 - T_LowRiseMultifamily)**2 1.527909 * Abs(LargestPropertyUseTypeGFA**3 - T_NonresidentialCOS) 1.231209 * exp(SteamUse - T_LowRiseMultifamily) 1.006624 * (PropertyGFATotal**2 - T_MidRiseMultifamily)**2 0.713898 * Abs(PropertyGFATotal**3 - SteamUse) -0.449550 * exp(-ElectricityUse + T_Warehouse) 0.410450 * (NaturalGasUse + T_SupermarketGroceryStore)**2 0.394244 * LargestPropertyUseTypeGFA**3*exp(BuildingAge) -0.356260 * (BuildingAge**3 - T_LowRiseMultifamily)**2 0.343393 * (-sqrt(ENERGYSTARScore) + ElectricityUse)**3 -0.317929 * exp(NaturalGasUse + SteamUse) 0.289272 * (PropertyGFABuildings**3 + T_Campus)**2 -0.248619 * (ENERGYSTARScore**3 - ElectricityUse)**3 0.245728 * (NaturalGasUse + T_Hospital)**3 0.199791 * (NaturalGasUse + T_Restaurant)**3 0.172219 * Abs(NaturalGasUse - T_SmallandMidSizedOffice) 0.162869 * (NaturalGasUse + T_SeniorCareCommunity)**2 0.105483 * exp(NaturalGasUse + T_Hotel) 0.101701 * NaturalGasUse*exp(BuildingAge) 0.101344 * Abs(NaturalGasUse - T_LargeOffice) 0.072453 * exp(NaturalGasUse + T_Other) 0.063575 * exp(ElectricityUse + T_MixedUseProperty) [AutoFeat] Final score: 0.8088 [AutoFeat] Computing 28 new features. [AutoFeat] 28/ 28 new features ...done. [AutoFeat] Computing 28 new features. [AutoFeat] 28/ 28 new features ...done. ## Final R^2: 0.8207 [AutoFeat] Computing 28 new features. [AutoFeat] 28/ 28 new features ...done.
L'auto feature engineering à 2 étapes paraît être un bon compromis.
# Retrain AutoFeat at the chosen depth (2 steps) and replace the train/test
# matrices with their feature-engineered versions for the models below.
steps = 2
print("### AutoFeat with %i feateng_steps" % steps)
afreg = AutoFeatRegressor(
    verbose=1,
    feateng_steps=steps,
    n_jobs=-1,
    max_gb=4,
    transformations=("exp", "abs", "sqrt", "^2", "^3"),
)
X_train_emissions = afreg.fit_transform(X_train_emissions,
                                        np.ravel(y_train_emissions))
X_test_emissions = afreg.transform(X_test_emissions)
r2 = afreg.score(X_test_emissions, np.ravel(y_test_emissions))
print("## Final R^2: %.4f" % r2)
plt.figure()
plt.scatter(afreg.predict(X_test_emissions), np.ravel(y_test_emissions), s=2)
plt.title("%i FE steps (R^2: %.4f; %i new features)"
          % (steps, r2, len(afreg.new_feat_cols_)))
### AutoFeat with 2 feateng_steps [AutoFeat] The 2 step feature engineering process could generate up to 20100 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.19 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 1288 feature combinations from 1770 original feature tuples - done. [feateng] Generated altogether 1785 new features in 2 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 411 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 1.0s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 1.0s remaining: 1.6s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 1.1s remaining: 0.7s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 1.1s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 1.1s finished [featsel] 58 features after 5 feature selection runs [featsel] 48 features after correlation filtering [featsel] 40 features after noise filtering [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. [AutoFeat] Final dataframe with 61 feature columns (21 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: -5.7922986120650375 10.940632 * PropertyGFATotal 1.842062 * BuildingAge*T_SeniorCareCommunity 1.691010 * SteamUse -1.601558 * T_LowRiseMultifamily*T_NonResidential 1.384750 * T_Laboratory 1.384473 * BuildingAge**3*T_Restaurant -1.259002 * T_Self-Storage Facility 1.238710 * T_Hospital 1.236762 * ENERGYSTARScore**3*T_Other -1.220146 * ElectricityUse*SteamUse -1.199618 * ENERGYSTARScore**3*T_DistributionCenter 1.161303 * ENERGYSTARScore*T_Hospital 1.159531 * NaturalGasUse 1.156063 * T_Supermarket / Grocery Store -1.078822 * BuildingAge**3*T_Other 0.955014 * T_NonresidentialCOS*T_SmallandMidSizedOffice -0.897900 * ENERGYSTARScore*T_Warehouse -0.843296 * NaturalGasUse*T_LargeOffice -0.829256 * BuildingAge**3*T_NonResidential 0.828740 * SteamUse*T_MixedUseProperty -0.748081 * BuildingAge**2*T_RetailStore 0.745312 * LargestPropertyUseTypeGFA**3*exp(BuildingAge) -0.645460 * ENERGYSTARScore**2*T_WorshipFacility -0.626649 * T_Distribution Center -0.576796 * ENERGYSTARScore 0.532195 * T_Campus -0.526401 * BuildingAge**3*T_Warehouse 0.519190 * T_Restaurant 0.426229 * BuildingAge**2*T_K12School 0.414857 * T_Nonresidential COS -0.383446 * sqrt(BuildingAge)*ENERGYSTARScore**3 -0.370584 * ElectricityUse*T_SmallandMidSizedOffice 0.363069 * T_Hotel 0.337072 * T_Large Office 0.323093 * NaturalGasUse*exp(BuildingAge) 0.277721 * T_NonResidential -0.247708 * T_Warehouse -0.241994 * T_Low-Rise Multifamily -0.166349 * T_Multifamily MR (5-9) 0.094957 * T_Other [AutoFeat] Final score: 0.8226 [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. ## Final R^2: 0.8400
Text(0.5, 1.0, '2 FE steps (R^2: 0.8400; 21 new features)')
Les premiers modèles que nous allons essayer seront des modèles linéaires.
from sklearn.linear_model import RidgeCV, LassoCV, Lasso, LassoLarsCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pprint import pprint
def test_model_lineaire_scaled(estimator, X_train, X_test, y_train, y_test):
    """Fit a scaled linear model and report its test and training metrics.

    Builds a StandardScaler + estimator pipeline, fits it on the training
    data, prints MSE/RMSE on the test set and MSE/RMSE/R^2 on the training
    set (to gauge overfitting), then plots predicted-vs-actual and the
    per-feature coefficients.

    Parameters
    ----------
    estimator : a *CV linear estimator exposing ``alpha_`` and ``coef_``
        after fitting (e.g. RidgeCV, LassoCV, ElasticNetCV).
    X_train, X_test : feature DataFrames (column names used for the bar plot).
    y_train, y_test : target arrays/Series.
    """
    name = type(estimator).__name__
    model_ = Pipeline([('standardize', StandardScaler()), (name, estimator)])
    model_.fit(X_train, np.ravel(y_train))

    r2 = model_.score(X_test, np.ravel(y_test))
    y_pred = model_.predict(X_test)
    y_train_pred = model_.predict(X_train)

    # Predicted vs. actual on the held-out set.
    plt.figure()
    plt.scatter(y_pred, np.ravel(y_test), s=2)
    plt.title("%s (R^2: %.4f)" % (name, r2))

    # Test-set metrics. BUG FIX: the "RMSE" line was computed on the
    # *training* predictions; it now uses the test set like the MSE line.
    print(name, "MSE: ", mean_squared_error(np.ravel(y_test), y_pred))
    print(name, "RMSE: ",
          mean_squared_error(np.ravel(y_test), y_pred, squared=False))
    print(name, "alpha", model_[1].alpha_)

    # Training-set metrics, for comparison against the test scores above.
    # BUG FIX: r2_score(y_true, y_pred) — the original passed the predictions
    # first; r2_score is not symmetric in its arguments.
    y_train_true = np.ravel(y_train)
    print("Score MSE model sur le jeu d'entrainement",
          mean_squared_error(y_train_true, y_train_pred))
    print("Score RMSE model sur le jeu d'entrainement",
          mean_squared_error(y_train_true, y_train_pred, squared=False))
    print("Score r2 model sur le jeu d'entrainement",
          r2_score(y_train_true, y_train_pred))

    # Coefficient magnitudes per (standardized) feature.
    importance = model_[1].coef_
    fig, ax = plt.subplots(figsize=(27, 10), dpi=300)
    ax.set_xticks([x for x in range(len(importance))])
    ax.set_xticklabels(X_train.columns, rotation=90)
    ax.bar(X_train.columns, importance)
    plt.show()
    return
# Evaluate each regularized linear model with the scaling pipeline helper.
candidate_models = [
    RidgeCV(alphas=np.linspace(0.0001, 200, 2000), scoring='r2'),
    LassoCV(alphas=np.linspace(0.0001, 10, 2000), max_iter=2000),
    ElasticNetCV(l1_ratio=np.linspace(0.0001, 1, 200),
                 max_iter=2000,
                 n_alphas=200),
]
for model in candidate_models:
    test_model_lineaire_scaled(model, X_train_emissions, X_test_emissions,
                               y_train_emissions, y_test_emissions)
RidgeCV MSE: 0.3706853322995339 RidgeCV RMSE: 0.6196822332848607 RidgeCV alpha 57.72893556778389 Score MSE model sur le jeu d'entrainement 0.3840060702489126 Score RMSE model sur le jeu d'entrainement 0.6196822332848607 Score r2 model sur le jeu d'entrainement 0.7853620557819163
/Users/loicvalenti/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:633: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.3989079669263447, tolerance: 0.4095311363534173 model = cd_fast.enet_coordinate_descent_gram( /Users/loicvalenti/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:633: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.5494511283169459, tolerance: 0.41618779512850673 model = cd_fast.enet_coordinate_descent_gram( /Users/loicvalenti/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:633: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.4660088289822397, tolerance: 0.4058592278707273 model = cd_fast.enet_coordinate_descent_gram(
LassoCV MSE: 0.37135746019834975 LassoCV RMSE: 0.6202286379180469 LassoCV alpha 0.005102451225612807 Score MSE model sur le jeu d'entrainement 0.3846835632936758 Score RMSE model sur le jeu d'entrainement 0.6202286379180469 Score r2 model sur le jeu d'entrainement 0.7837120994096812
ElasticNetCV MSE: 0.370284356861891 ElasticNetCV RMSE: 0.6195805912250039 ElasticNetCV alpha 0.01461856580606787 Score MSE model sur le jeu d'entrainement 0.3838801090227254 Score RMSE model sur le jeu d'entrainement 0.6195805912250039 Score r2 model sur le jeu d'entrainement 0.7853884531715427
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
def print_results_xgboost(estimator, X_train, X_test, y_train, y_test):
    """Fit a grid-searched XGBoost regressor and report its metrics.

    Fits ``estimator``, prints MSE/RMSE on the test set, MSE/RMSE/R^2 on the
    training set (to gauge overfitting) and the best hyper-parameters, then
    plots predicted-vs-actual and the feature importances of the best model.

    Parameters
    ----------
    estimator : GridSearchCV wrapping an XGBRegressor — must expose
        ``best_params_`` and ``best_estimator_`` after fitting.
    X_train, X_test : feature DataFrames (column names used for the bar plot).
    y_train, y_test : target arrays.
    """
    estimator.fit(X_train, y_train)
    r2 = estimator.score(X_test, y_test)
    y_pred = estimator.predict(X_test)
    # Single prediction pass on the training set (the original computed it twice).
    y_train_pred = estimator.predict(X_train)

    # Predicted vs. actual on the held-out set.
    plt.figure()
    plt.scatter(y_pred, np.ravel(y_test), s=2)
    plt.title("%s (R^2: %.4f)" % (type(estimator).__name__, r2))

    # Test-set metrics. BUG FIX: the "RMSE" line was computed on the
    # *training* predictions; it now matches the test-set MSE line above.
    print("MSE: ", mean_squared_error(np.ravel(y_test), y_pred))
    print("RMSE: ", mean_squared_error(np.ravel(y_test), y_pred, squared=False))

    # Training-set metrics. BUG FIX: r2_score(y_true, y_pred) — the original
    # swapped the arguments; r2_score is not symmetric.
    y_train_true = np.ravel(y_train)
    print("Score MSE model sur le jeu d'entrainement",
          mean_squared_error(y_train_true, y_train_pred))
    print("Score RMSE model sur le jeu d'entrainement",
          mean_squared_error(y_train_true, y_train_pred, squared=False))
    print("Score r2 model sur le jeu d'entrainement",
          r2_score(y_train_true, y_train_pred))
    print(estimator.best_params_)

    # Feature importances from the best model found by the grid search.
    importance = estimator.best_estimator_.feature_importances_
    fig, ax = plt.subplots(figsize=(27, 10), dpi=300)
    ax.set_xticks([x for x in range(len(importance))])
    ax.set_xticklabels(X_train.columns, rotation=90)
    ax.bar(X_train.columns, importance)
    plt.show()
    return
# Grid-search an XGBoost regressor over a modest hyper-parameter grid
# (2-fold CV, R^2 scoring) and report the results.
xgb1 = XGBRegressor()
param_grid = {
    'nthread': [0],  # when using hyperthreading, xgboost may become slower
    'objective': ['reg:squarederror'],
    'booster': ['gbtree'],
    'learning_rate': [.03, 0.07, 0.5],  # a.k.a. `eta`
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 4],
    'subsample': [0.3, 0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [100, 200],
    'reg_alpha': [0.01, 0.5],
    'reg_lambda': [0.01, 0.5],
}
xgb_grid = GridSearchCV(xgb1, param_grid, cv=2, scoring="r2", verbose=1)
print_results_xgboost(xgb_grid, X_train_emissions, X_test_emissions,
                      y_train_emissions, y_test_emissions)
Fitting 2 folds for each of 288 candidates, totalling 576 fits
MSE: 0.454921383997785
RMSE: 0.5625229003469768
Score MSE model sur le jeu d'entrainement 0.3164320134147748
Score RMSE model sur le jeu d'entrainement 0.5625229003469768
Score r2 model sur le jeu d'entrainement 0.8192084166169047
{'booster': 'gbtree', 'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 200, 'nthread': 0, 'objective': 'reg:squarederror', 'reg_alpha': 0.01, 'reg_lambda': 0.5, 'subsample': 0.7}
# Reload the untouched feature/target tables, then rebuild the split WITHOUT
# the ENERGYSTARScore feature to measure that feature's contribution.
df_emissions = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/df_emissions.csv"
)
df_emissions = df_emissions.drop(columns="Unnamed: 0")
y_emissions = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/y_emissions.csv"
)
y_emissions = y_emissions.drop(columns="Unnamed: 0")

features_no_score = df_emissions.drop(columns=["ENERGYSTARScore"])
X_train_emissions, X_test_emissions, y_train_emissions, y_test_emissions = (
    model_selection.train_test_split(
        features_no_score,
        np.ravel(y_emissions),
        test_size=0.3,  # 30% held out for testing
        shuffle=True,
    ))
# Re-run the AutoFeat depth sweep (0..3 steps) on the reduced feature set
# (ENERGYSTARScore removed) to compare against the earlier sweep.
for steps in range(4):
    np.random.seed(55)  # same seed as the first sweep, for comparability
    print("### AutoFeat with %i feateng_steps" % steps)
    afreg = AutoFeatRegressor(verbose=1,
                              feateng_steps=steps,
                              n_jobs=-1,
                              max_gb=4,
                              transformations=("exp", "abs", "sqrt", "^2", "^3"))
    df_train_emissions = afreg.fit_transform(X_train_emissions,
                                             np.ravel(y_train_emissions))
    df_test_emissions = afreg.transform(X_test_emissions)
    r2 = afreg.score(X_test_emissions, np.ravel(y_test_emissions))
    print("## Final R^2: %.4f" % r2)
    plt.figure()
    plt.scatter(afreg.predict(X_test_emissions),
                np.ravel(y_test_emissions),
                s=2)
    plt.title("%i FE steps (R^2: %.4f; %i new features)"
              % (steps, r2, len(afreg.new_feat_cols_)))
### AutoFeat with 0 feateng_steps [AutoFeat] The 0 step feature engineering process could generate up to 39 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.00 gb of space. [feateng] Warning: no features generated for max_steps < 1. [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 3.7s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 3.7s remaining: 5.6s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 3.8s remaining: 2.5s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 3.8s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 3.8s finished [featsel] 26 features after 5 feature selection runs [featsel] 21 features after correlation filtering [featsel] 19 features after noise filtering [AutoFeat] Final dataframe with 39 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: -6.917961850804405 12.398470 * PropertyGFATotal 2.392225 * T_Hospital 1.564502 * ElectricityUse 1.521923 * T_Laboratory 1.495583 * T_Supermarket / Grocery Store 1.205656 * T_Restaurant -1.148302 * T_Self-Storage Facility 1.125316 * SteamUse 0.814303 * T_Senior Care Community 0.798934 * T_Campus -0.582113 * T_Distribution Center -0.482131 * T_Warehouse 0.428700 * T_Hotel 0.412831 * T_Nonresidential COS 0.331957 * T_Other -0.307554 * T_Worship Facility -0.300025 * T_Multifamily MR (5-9) 0.295714 * BuildingAge -0.265473 * T_Multifamily LR (1-4) [AutoFeat] Final score: 0.7661 ## Final R^2: 0.7831 ### AutoFeat with 1 feateng_steps [AutoFeat] The 1 step feature engineering process could generate up to 195 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.00 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 16 transformed features from 39 original features - done. 
[feateng] Generated altogether 16 new features in 1 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 1 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.1s [Parallel(n_jobs=-1)]: Batch computation too fast (0.1479s.) Setting batch_size=2. [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.2s remaining: 0.3s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.2s remaining: 0.1s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.2s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.2s finished [featsel] 25 features after 5 feature selection runs [featsel] 22 features after correlation filtering [featsel] 20 features after noise filtering [AutoFeat] Final dataframe with 39 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: -6.877222530507128 12.327526 * PropertyGFATotal 2.415132 * T_Hospital 1.561559 * ElectricityUse 1.533863 * T_Laboratory 1.509466 * T_Supermarket / Grocery Store 1.213580 * T_Restaurant -1.136393 * T_Self-Storage Facility 1.126437 * SteamUse 0.815389 * T_Senior Care Community 0.811797 * T_Campus -0.569410 * T_Distribution Center -0.470043 * T_Warehouse 0.443079 * T_Hotel 0.417035 * T_Nonresidential COS 0.342619 * T_Other 0.299410 * BuildingAge -0.297726 * T_Worship Facility -0.285349 * T_Multifamily MR (5-9) -0.254920 * T_Multifamily LR (1-4) 0.139259 * T_Multifamily HR (10+) [AutoFeat] Final score: 0.7663 ## Final R^2: 0.7827 ### AutoFeat with 2 feateng_steps [AutoFeat] The 2 step feature engineering process could generate up to 19110 features. [AutoFeat] With 2349 data points this new feature matrix would use about 0.18 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 16 transformed features from 39 original features - done. 
[feateng] Step 2: first combination of features [feateng] Generated 1008 feature combinations from 1485 original feature tuples - done. [feateng] Generated altogether 1497 new features in 2 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 266 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.6s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.6s remaining: 0.9s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.6s remaining: 0.4s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.7s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.7s finished [featsel] 39 features after 5 feature selection runs [featsel] 33 features after correlation filtering [featsel] 23 features after noise filtering [AutoFeat] Computing 8 new features. [AutoFeat] 8/ 8 new features ...done. [AutoFeat] Final dataframe with 47 feature columns (8 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: -6.2690191598734994 11.174457 * PropertyGFATotal 2.429075 * BuildingAge**2*T_Restaurant 2.260512 * T_Hospital 2.028300 * BuildingAge*T_SeniorCareCommunity 1.391222 * T_Supermarket / Grocery Store 1.391160 * ElectricityUse -1.143652 * T_Self-Storage Facility 1.102321 * SteamUse*T_MixedUseProperty 1.089497 * SteamUse -1.028265 * BuildingAge**3*T_NonResidential -1.016098 * NaturalGasUse*T_LargeOffice -0.698004 * T_Distribution Center -0.541485 * T_Warehouse 0.525369 * PropertyGFABuildings**3*exp(BuildingAge) 0.445815 * T_Large Office 0.412513 * T_Hotel 0.369489 * sqrt(BuildingAge)*NaturalGasUse 0.368275 * T_Other 0.346198 * T_Nonresidential COS -0.323936 * T_Low-Rise Multifamily -0.300976 * T_Mid-Rise Multifamily 0.284853 * sqrt(BuildingAge)*LargestPropertyUseTypeGFA**3 0.137247 * T_NonResidential [AutoFeat] Final score: 0.7826 [AutoFeat] Computing 8 new features. [AutoFeat] 8/ 8 new features ...done. [AutoFeat] Computing 8 new features. [AutoFeat] 8/ 8 new features ...done. ## Final R^2: 0.7825 [AutoFeat] Computing 8 new features. [AutoFeat] 8/ 8 new features ...done. ### AutoFeat with 3 feateng_steps [AutoFeat] The 3 step feature engineering process could generate up to 321750 features. [AutoFeat] With 2349 data points this new feature matrix would use about 3.02 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 16 transformed features from 39 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 5460 feature combinations from 1485 original feature tuples - done. [feateng] Step 3: transformation of new features [feateng] Generated 17077 transformed features from 5460 original features - done. [feateng] Generated altogether 24016 new features in 3 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 6235 additional features [featsel] Scaling data...done. 
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 28.0s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 29.0s remaining: 43.5s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 29.5s remaining: 19.6s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 31.6s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 31.6s finished [featsel] 55 features after 5 feature selection runs [featsel] 25 features after correlation filtering [featsel] 22 features after noise filtering [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. [AutoFeat] Final dataframe with 60 feature columns (21 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: -4.563061055086486 4.010980 * PropertyGFATotal 2.355103 * (PropertyGFABuildings - T_SelfStorageFacility)**2 -1.796140 * NaturalGasUse**3*PropertyGFATotal**6 1.617912 * (-LargestPropertyUseTypeGFA**2 + T_LowRiseMultifamily)**3 1.470908 * (PropertyGFATotal - T_DistributionCenter)**3 1.284006 * Abs(LargestPropertyUseTypeGFA**2 - T_Warehouse) -1.223215 * (-LargestPropertyUseTypeGFA**2 + T_MidRiseMultifamily)**3 1.113105 * exp(SteamUse - T_LowRiseMultifamily) 1.038640 * Abs(LargestPropertyUseTypeGFA**3 - T_NonresidentialCOS) 0.969134 * (LargestPropertyUseTypeGFA**3 - T_LargeOffice)**2 0.489665 * (ElectricityUse + T_SupermarketGroceryStore)**2 -0.415812 * exp(-ElectricityUse + T_Warehouse) 0.348522 * PropertyGFATotal**3*exp(BuildingAge) 0.289221 * Abs(NaturalGasUse - T_SmallandMidSizedOffice) 0.286923 * (NaturalGasUse + T_Hospital)**3 -0.248546 * exp(NaturalGasUse + SteamUse) 0.237612 * Abs(NaturalGasUse - T_LargeOffice) 0.216468 * (NaturalGasUse + T_SeniorCareCommunity)**2 0.202576 * (NaturalGasUse + T_Restaurant)**3 0.098929 * exp(ElectricityUse + T_Hotel) 0.086081 * (ElectricityUse + T_Campus)**3 0.073293 * exp(ElectricityUse + T_Other) [AutoFeat] Final score: 0.7797 [AutoFeat] Computing 21 
new features. [AutoFeat] 21/ 21 new features ...done. [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. ## Final R^2: 0.7922 [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done.
Sans l'ENERGYSTARScore, l'autofeat n'améliore pas la performance ; nous ne l'utiliserons donc pas.
def Test_Sans_ENERGYSTARScore(X_train, X_test, y_train, y_test):
    """Benchmark linear models and a grid-searched XGBoost without ENERGYSTARScore.

    Fits cross-validated RidgeCV, LassoCV and ElasticNetCV baselines (each
    reported through ``test_model_lineaire_scaled``), then runs a 2-fold
    GridSearchCV over an XGBRegressor and prints its results through
    ``print_results_xgboost``. All results are printed; nothing is returned.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices with ENERGYSTARScore excluded.
    y_train, y_test : array-like
        Target vectors (here: the emissions target).
    """
    # Cross-validated linear baselines; the helper prints MSE/RMSE/R2 for each.
    linear_models = [
        RidgeCV(alphas=np.linspace(0.0001, 200, 2000), scoring='r2'),
        LassoCV(alphas=np.linspace(0.0001, 10, 2000), max_iter=10000),
        ElasticNetCV(l1_ratio=np.linspace(0.0001, 1, 200),
                     max_iter=2000,
                     n_alphas=200),
    ]
    for model in linear_models:
        test_model_lineaire_scaled(model, X_train, X_test, y_train, y_test)

    # XGBoost hyper-parameter grid: 288 candidates, searched with 2-fold CV
    # to keep the total number of fits (576) tractable.
    xgb_estimator = XGBRegressor()
    param_grid = {
        'nthread': [0],  # with hyperthreading, extra xgboost threads may slow things down
        'objective': ['reg:squarederror'],
        'booster': ['gbtree'],
        'learning_rate': [.03, 0.07, 0.5],  # a.k.a. `eta`
        'max_depth': [5, 6, 7],
        'min_child_weight': [2, 4],
        'subsample': [0.3, 0.7],
        'colsample_bytree': [0.7],
        'n_estimators': [100, 200],
        'reg_alpha': [0.01, 0.5],
        'reg_lambda': [0.01, 0.5],
    }
    xgb_grid = GridSearchCV(xgb_estimator,
                            param_grid,
                            cv=2,
                            scoring="r2",
                            verbose=1)
    print_results_xgboost(xgb_grid, X_train, X_test, y_train, y_test)
# Run the full benchmark on the emissions train/test split.
Test_Sans_ENERGYSTARScore(X_train_emissions,
                          X_test_emissions,
                          y_train_emissions,
                          y_test_emissions)
RidgeCV MSE: 0.5019073032295354 RidgeCV RMSE: 0.711624456827691 RidgeCV alpha 75.3377311655828 Score MSE model sur le jeu d'entrainement 0.5064093675553063 Score RMSE model sur le jeu d'entrainement 0.711624456827691 Score r2 model sur le jeu d'entrainement 0.6956388093335426
LassoCV MSE: 0.5048048715977104 LassoCV RMSE: 0.7114287492101077 LassoCV alpha 0.005102451225612807 Score MSE model sur le jeu d'entrainement 0.5061308652026583 Score RMSE model sur le jeu d'entrainement 0.7114287492101077 Score r2 model sur le jeu d'entrainement 0.6972147993876805
ElasticNetCV MSE: 0.5048389855278422 ElasticNetCV RMSE: 0.7114576027284767 ElasticNetCV alpha 0.005256389170453785 Score MSE model sur le jeu d'entrainement 0.5061719204801509 Score RMSE model sur le jeu d'entrainement 0.7114576027284767 Score r2 model sur le jeu d'entrainement 0.697035806963225
Fitting 2 folds for each of 288 candidates, totalling 576 fits
MSE: 0.47267424470534775
RMSE: 0.5841075805072042
Score MSE model sur le jeu d'entrainement 0.3411816656059801
Score RMSE model sur le jeu d'entrainement 0.5841075805072042
Score r2 model sur le jeu d'entrainement 0.7975338829604479
{'booster': 'gbtree', 'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 200, 'nthread': 0, 'objective': 'reg:squarederror', 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.7}
Après les tests de modèles effectués, nous choisissons le modèle Lasso, pour sa performance et son bonus de réduction dimensionnelle, qui permet une meilleure interprétabilité.